The data contains features extracted from the silhouettes of vehicles viewed from different angles. Four "Corgi" model vehicles were used for the experiment: a double-decker bus, a Chevrolet van, a Saab 9000 and an Opel Manta 400. This particular combination of vehicles was chosen with the expectation that the bus, the van and either one of the cars would be readily distinguishable, but that it would be more difficult to distinguish between the two cars.
Object recognition
The purpose is to classify a given silhouette as one of three types of vehicle, using a set of features extracted from the silhouette. The vehicle may be viewed from one of many different angles.
● All the features are geometric features extracted from the silhouette. ● All are numeric in nature.
● Exploratory Data Analysis ● Reduce the number of dimensions in the dataset with minimal information loss ● Train a model using the Principal Components
Apply dimensionality reduction technique – PCA and train a model using principal components instead of training the model using raw data.
# Numerical libraries
import numpy as np
# to handle data in form of rows and columns
import pandas as pd
# importing ploting libraries
import matplotlib.pyplot as plt
#importing seaborn for statistical plots
import seaborn as sns
vehicle_df = pd.read_csv('vehicle.csv')
vehicle_df.shape
vehicle_df.info()
vehicle_df.head(10)
#Since the variable is categorical, you can use value_counts function
pd.value_counts(vehicle_df['class'])
import matplotlib.pyplot as plt
%matplotlib inline
pd.value_counts(vehicle_df["class"]).plot(kind="bar")
vehicle_df.isna().sum()
vehicle_df.dtypes
#several null values.. box-plot each numeric column to check the data shape before imputing
for column in vehicle_df.select_dtypes(include=[np.number]):
    plt.figure()
    box_plot = sns.boxplot(data=vehicle_df[column], orient="h")
    box_plot.set(xlabel=column)
# Impute missing values with each column's median (robust to the outliers
# visible in the box plots). numeric_only=True is required: 'class' is a
# string column and DataFrame.median() raises on non-numeric data in
# modern pandas.
vehicle_df_na_removed = vehicle_df.fillna(vehicle_df.median(numeric_only=True))
vehicle_df_na_removed.head(10)
#Outlier removal -- we will make outliers collapse to 5th and 95th percentile
# Rename the 18 silhouette-feature columns (plus the target 'class')
# to descriptive snake_case names.
vehicle_df_na_removed.columns = ['compactness','circularity','distance_circularity','radius_ratio','pr_axis_aspect_ratio','max_length_aspect_ratio','scatter_ratio','elongatedness','pr_axis_rectangularity','max_length_rectangularity','scaled_variance','scaled_variance_1','scaled_radius_of_gyration','scaled_radius_of_gyration_1','skewness_about','skewness_about_1','skewness_about_2','hollows_ratio','class']
# Outlier rule found at -- https://www.kaggle.com/general/24617
# a value "a" from the vector "x" is an outlier if
#   a > median(x) + 1.5*iqr(x)  or  a < median(x) - 1.5*iqr(x)
# where iqr is the interquartile range = third quartile - first quartile
def outliers(x):
    """Boolean mask over Series *x*: True where the value lies more than
    1.5 * IQR away from the median."""
    iqr = x.quantile(0.75) - x.quantile(0.25)
    return np.abs(x - x.median()) > 1.5 * iqr

# Replace the upper outlier(s) with the 95th percentile and the lower one(s) with the 5th percentile
def replace(x):  # x is a numeric Series; returns a new Series
    """Collapse every outlier of *x*: values above the median go to the
    95th percentile, values below go to the 5th percentile.

    The previous implementation passed ``[out.min(), out.max()]`` to
    ``Series.replace`` which (a) only substituted the two extreme outlier
    values, leaving other outliers untouched, and (b) produced NaN
    to_replace entries when there were no outliers at all. Boolean masks
    fix both problems.
    """
    mask = outliers(x)
    result = x.copy()
    if not mask.any():
        return result  # nothing to collapse
    low, high = np.percentile(x, 5), np.percentile(x, 95)
    med = x.median()
    result[mask & (x > med)] = high
    result[mask & (x < med)] = low
    return result
# Normalize outliers column-by-column (replace operates on one Series at a time).
vehicle_df_out_normalized = vehicle_df_na_removed.select_dtypes(include=[np.number]).apply(replace, axis=0)
#vehicle_df_out_normalized.to_csv('vehicle_df_out_removed.csv')
vehicle_df_out_normalized.shape
# Box-plot the cleaned data, one figure per numeric column.
for col_name in vehicle_df_out_normalized.select_dtypes(include=[np.number]):
    plt.figure()
    axis = sns.boxplot(data=vehicle_df_out_normalized[col_name], orient="h")
    axis.set(xlabel=col_name)
a - vehicle_df_na_removed :: the NaN-imputed data (outliers not yet normalized)
b - vehicle_df_out_normalized :: a copy of (a) with outliers additionally collapsed to the 5th and 95th percentiles
# The feature scales are not known to us, so standardize with z-scores
# before any modelling; scipy's zscore handles this per column.
interest_df = vehicle_df_out_normalized.copy()
from scipy.stats import zscore
interest_df_z = interest_df.apply(zscore)
interest_df_z.head()
# Pairwise scatter plots with KDE curves on the diagonal.
sns.pairplot(interest_df_z, diag_kind="kde")
# Correlation matrix of the standardized features, printed and drawn
# as a heatmap labelled with the column names.
corr = interest_df_z.corr()
print(corr)
sns.heatmap(corr,
            xticklabels=corr.columns,
            yticklabels=corr.columns)
# ref ::https://www.statsmodels.org/stable/generated/statsmodels.stats.outliers_influence.variance_inflation_factor.html
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
import statsmodels.api as sm
def variance_inflation_factors(exog_df):
    '''
    Return the variance inflation factor (VIF) of every column of exog_df.

    Parameters
    ----------
    exog_df : dataframe, (nobs, k_vars)
        design matrix with all explanatory variables, as for example used in
        regression.

    Returns
    -------
    vif : Series
        variance inflation factors, indexed by column name (including the
        added constant).
    '''
    # Add an intercept so each auxiliary regression is properly specified.
    exog_df = add_constant(exog_df)

    def _vif(col):
        # Regress `col` on all the other columns; VIF = 1 / (1 - R^2).
        r2 = sm.OLS(exog_df[col].values,
                    exog_df.loc[:, exog_df.columns != col].values).fit().rsquared
        # A perfectly collinear column has R^2 == 1; report inf rather than
        # raising ZeroDivisionError as the plain 1/(1 - r2) would.
        return np.inf if r2 >= 1.0 else 1.0 / (1.0 - r2)

    return pd.Series([_vif(col) for col in exog_df],
                     index=exog_df.columns,
                     name='VIF')
# Compute VIFs on a copy of the standardized data to gauge multicollinearity.
df_tmp = interest_df_z.copy()
variance_inflation_factors(df_tmp)
Based on these VIF values we can see that distance_circularity, radius_ratio, scatter_ratio, elongatedness, pr_axis_rectangularity, scaled_variance, scaled_variance_1, scaled_radius_of_gyration_1 and hollows_ratio are highly collinear and will cause multicollinearity problems.
# Covariance matrix of the standardized features (rowvar=False: columns are variables).
covMatrix = np.cov(interest_df_z, rowvar=False)
print(covMatrix)
from sklearn.decomposition import PCA
# Fit PCA with 10 components to inspect the explained-variance profile.
pca = PCA(n_components=10)
pca.fit(interest_df_z)
print(pca.explained_variance_)
print(pca.components_)
print(pca.explained_variance_ratio_)
component_ids = list(range(1, 11))
# Scree plot: variance explained by each component.
plt.bar(component_ids, pca.explained_variance_ratio_, alpha=0.5, align='center')
plt.ylabel('Variation explained')
plt.xlabel('eigen Value')
plt.show()
# Cumulative variance explained across components.
plt.step(component_ids, np.cumsum(pca.explained_variance_ratio_), where='mid')
plt.ylabel('Cumulative of variation explained')
plt.xlabel('eigen Value')
plt.show()
# Quantify where the cumulative ratio crosses 95 per cent.
print(pca.explained_variance_ratio_.cumsum())
# Eight components capture roughly 95% of the variance, so refit with that count.
pca_final = PCA(n_components=8)
pca_final.fit(interest_df_z)
print(pca_final.components_)
print(pca_final.explained_variance_ratio_)
# Project the standardized data onto the 8 retained principal components.
interest_df_z_pca = pca_final.transform(interest_df_z)
interest_df_z_pca.shape
# Pairwise plots of the (now uncorrelated) component scores.
sns.pairplot(pd.DataFrame(interest_df_z_pca), diag_kind='kde')
# Target labels (bus / car / van) as a flat array.
y = vehicle_df['class'].values.ravel()
## Split the PCA-projected data into training and test sets (70/30).
from sklearn.model_selection import train_test_split
X_PCA_train, X_PCA_test, y_train, y_test = train_test_split(
    interest_df_z_pca, y, test_size=0.30, random_state=1)
from sklearn import svm
# RBF-kernel SVM trained on the principal-component scores.
clf_PCA = svm.SVC(gamma=0.025, C=3)
clf_PCA.fit(X_PCA_train, y_train)
pred_train = clf_PCA.predict(X_PCA_train)
clf_PCA.score(X_PCA_train, y_train)
pred_test = clf_PCA.predict(X_PCA_test)
clf_PCA.score(X_PCA_test, y_test)
from sklearn.metrics import classification_report, confusion_matrix
# Confusion matrices for both splits, then the per-class report.
mat_train = confusion_matrix(y_train, pred_train)
print("Train set confusion matrix = \n", mat_train)
mat_test = confusion_matrix(y_test, pred_test)
print("Test set confusion matrix = \n", mat_test)
from sklearn import metrics
print("SVM Metrics = \n", metrics.classification_report(y_test, pred_test))
## Split the full (non-PCA) standardized data into training and test sets.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    interest_df_z, y, test_size=0.30, random_state=1)
from sklearn import svm
# Same SVM configuration as the PCA model, for a like-for-like comparison.
clf = svm.SVC(gamma=0.025, C=3)
clf.fit(X_train, y_train)
pred_train = clf.predict(X_train)
clf.score(X_train, y_train)
pred_test = clf.predict(X_test)
clf.score(X_test, y_test)
# Confusion matrices and the per-class classification report.
mat_train = confusion_matrix(y_train, pred_train)
print("Train set confusion matrix = \n", mat_train)
mat_test = confusion_matrix(y_test, pred_test)
print("Test set confusion matrix = \n", mat_test)
print("SVM Metrics = \n", metrics.classification_report(y_test, pred_test))
| Model Name | Dimension | Training Accuracy | Testing Accuracy | class | precision | recall | f1 Score | support |
|---|---|---|---|---|---|---|---|---|
| SVM - PCA | 8 | 0.9577 | 0.9409 | - | - | - | - | - |
| - | - | - | - | bus | 0.90 | 0.97 | 0.93 | 59 |
| - | - | - | - | car | 0.97 | 0.95 | 0.96 | 133 |
| - | - | - | - | van | 0.92 | 0.89 | 0.90 | 62 |
| SVM - Non PCA | 18 | 0.9797 | 0.9488 | - | - | - | - | - |
| - | - | - | - | bus | 0.89 | 0.97 | 0.93 | 59 |
| - | - | - | - | car | 0.98 | 0.96 | 0.97 | 133 |
| - | - | - | - | van | 0.93 | 0.90 | 0.92 | 62 |
# Ref :: https://medium.com/@aneesha/svm-parameter-tuning-in-scikit-learn-using-gridsearchcv-2413c02125a0
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, make_scorer
#Grid Search
from sklearn.model_selection import GridSearchCV
def svc_param_selection(X, y, nfolds):
    """Grid-search SVC hyper-parameters (C, gamma, kernel) with nfolds-fold CV.

    Returns the fitted GridSearchCV object, scored by plain accuracy.
    """
    search_space = {
        'C': [0.01, 0.05, 0.5, 1],
        'gamma': [0.001, 0.01, 0.1, 1],
        'kernel': ['linear', 'rbf'],
    }
    # https://stackoverflow.com/questions/50752553/gridsearchcv-for-the-multi-class-svm-in-python
    accuracy_scorer = make_scorer(accuracy_score, greater_is_better=True)
    searcher = GridSearchCV(svm.SVC(), search_space,
                            cv=nfolds, scoring=accuracy_scorer)
    searcher.fit(X, y)
    return searcher
# Grid search on the PCA-projected training data with 8-fold CV.
grid_clf_acc = svc_param_selection(X_PCA_train, y_train, 8)
print(grid_clf_acc.best_params_)
grid_clf_acc.score(X_PCA_train, y_train)
# Predict with the tuned parameters and score the held-out test set.
y_pred_acc = grid_clf_acc.predict(X_PCA_test)
grid_clf_acc.score(X_PCA_test, y_test)
print("SVM Metrics = \n", metrics.classification_report(y_test, y_pred_acc))
# SVM (grid search) confusion matrix.
confusion_matrix(y_test, y_pred_acc)
# Repeat the grid search on the full (non-PCA) standardized data.
grid_clf_acc_Non_PCA = svc_param_selection(X_train, y_train, 8)
print(grid_clf_acc_Non_PCA.best_params_)
grid_clf_acc_Non_PCA.score(X_train, y_train)
# Predict with the tuned parameters and score the held-out test set.
y_pred_acc = grid_clf_acc_Non_PCA.predict(X_test)
grid_clf_acc_Non_PCA.score(X_test, y_test)
print("SVM Metrics = \n", metrics.classification_report(y_test, y_pred_acc))
With SVM-based grid search on both the PCA and non-PCA data, the best parameter combination found in each case is {'C': 1, 'gamma': 0.1, 'kernel': 'rbf'}. Note that the scorer used optimizes for accuracy.
| Model | Training Accuracy | Testing Accuracy | class | precision | recall | f1 Score | support |
|---|---|---|---|---|---|---|---|
| Grid SVM PCA | 0.97128 | 0.93307 | - | - | - | - | - |
| - | - | - | bus | 0.94 | 0.98 | 0.96 | 59 |
| - | - | - | car | 0.95 | 0.94 | 0.94 | 133 |
| - | - | - | van | 0.90 | 0.87 | 0.89 | 62 |
| Grid SVM Non PCA | 0.98310 | 0.93700 | - | - | - | - | - |
| - | - | - | bus | 0.89 | 0.98 | 0.94 | 59 |
| - | - | - | car | 0.98 | 0.95 | 0.96 | 133 |
| - | - | - | van | 0.90 | 0.87 | 0.89 | 62 |
#### generate the linkage matrix (Ward criterion, Euclidean distances)
from scipy.cluster.hierarchy import dendrogram, linkage
Z = linkage(interest_df_z, 'ward', metric='euclidean')
Z.shape
Z[:]
# Full dendrogram of all merges.
plt.figure(figsize=(25, 10))
dendrogram(Z)
plt.show()
# Truncated dendrogram: only the last 3 merged clusters, since we know
# there are 3 vehicle types.
dendrogram(
    Z,
    truncate_mode='lastp',  # show only the last p merged clusters
    p=3,
)
plt.show()
# Flat cluster assignment: cut the tree at cophenetic distance 50.
max_d = 50
from scipy.cluster.hierarchy import fcluster
clusters = fcluster(Z, max_d, criterion='distance')
clusters
#### plt.figure(figsize=(10, 8))
# Two representative features, colored by the assigned cluster.
plt.scatter(interest_df_z['compactness'], interest_df_z['circularity'], c=clusters)
plt.show()
# Repeat the unsupervised clustering experiment on the PCA-projected data.
#### generate the linkage matrix
Z_PCA = linkage(interest_df_z_pca, 'ward', metric='euclidean')
Z_PCA.shape
Z_PCA[:]
# Full dendrogram of all merges.
plt.figure(figsize=(25, 10))
dendrogram(Z_PCA)
plt.show()
# Truncated dendrogram: only the last 3 merged clusters.
dendrogram(
    Z_PCA,
    truncate_mode='lastp',  # show only the last p merged clusters
    p=3,
)
plt.show()
# Flat cluster assignment by cutting the tree at distance 50.
max_d = 50
from scipy.cluster.hierarchy import fcluster
clusters = fcluster(Z_PCA, max_d, criterion='distance')
clusters
#### plt.figure(figsize=(10, 8))
# First two principal components, colored by the assigned cluster.
plt.scatter(interest_df_z_pca[:, 0], interest_df_z_pca[:, 1], c=clusters)
plt.show()